Using S&P500 data that I've pulled in a different python script, evaluate different features for correlation to stock price change, and use this for my equity trend dash. There is still more work to do here - additional features to look at, and something more elegant that the gradient boosted tree algo from SKLearn.
import requests
import pandas as pd
import pickle
import os
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import time
import numpy as np
import plotly.io as pio
from tqdm import tqdm
from datetime import datetime
from dateutil.relativedelta import relativedelta
import seaborn as sn
from sklearn.cluster import KMeans
import warnings
df_sp500 = pd.read_csv('sp500_tickers.csv')
tickers = df_sp500['Symbol'].values
df = pd.read_pickle('sp500_earn_balance_close.pkl')
df[df.columns[5:]] = df[df.columns[5:]].apply(pd.to_numeric, errors='coerce')
df
| fiscalDateEnding | reportedEPS | eps_chng | ticker | reportedCurrency | totalAssets | totalCurrentAssets | cashAndCashEquivalentsAtCarryingValue | cashAndShortTermInvestments | inventory | ... | otherNonCurrentLiabilities | totalShareholderEquity | treasuryStock | retainedEarnings | commonStock | commonStockSharesOutstanding | 1D_close_after | 60D_close_after | 60D_close_chng | 1D_close_chng | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-12-31 | 3.56 | 3.450000 | BAC | USD | 3169495000000 | 363856000000 | 348221000000 | 348221000000 | 130000000.0 | ... | 1.923360e+11 | 270066000000 | NaN | 1.880640e+11 | 6.239800e+10 | 8077831463 | 49.18 | 45.02 | -0.084587 | 0.090707 |
| 1 | 2020-12-31 | 1.87 | -0.474719 | BAC | USD | 2819627000000 | 389706000000 | 380463000000 | 380463000000 | 164000000.0 | ... | 1.703390e+11 | 272924000000 | NaN | 1.640880e+11 | 8.598200e+10 | 8650814105 | 32.53 | 34.71 | 0.067015 | 0.067958 |
| 2 | 2019-12-31 | 2.93 | 0.566845 | BAC | USD | 2434079000000 | 170760000000 | 161560000000 | 161560000000 | 285000000.0 | ... | 1.651310e+11 | 264810000000 | NaN | 1.563190e+11 | 9.172300e+10 | 8836148954 | 34.90 | 28.50 | -0.183381 | -0.018836 |
| 3 | 2018-12-31 | 2.61 | -0.109215 | BAC | USD | 2354507000000 | 187771000000 | 177404000000 | 177404000000 | 300000000.0 | ... | 1.870180e+11 | 265325000000 | NaN | 1.363140e+11 | 1.188960e+11 | 9669286370 | 25.58 | 29.31 | 0.145817 | 0.041107 |
| 4 | 2017-12-31 | 1.82 | -0.302682 | BAC | USD | 2281234000000 | 378222000000 | 157434000000 | 366792000000 | 236000000.0 | ... | 1.769020e+11 | 267146000000 | NaN | 1.138160e+11 | 1.380890e+11 | 10287302431 | 30.33 | 32.03 | 0.056050 | 0.019496 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | 2021-12-31 | -8.08 | 3.439560 | NCLH | USD | 18729837000 | 3301568000 | 1506647000 | 1746647000 | 118205000.0 | ... | 9.970550e+08 | 2432650000 | NaN | -4.796406e+09 | 4.170000e+05 | 416891915 | 22.20 | 19.65 | -0.114865 | 0.058151 |
| 1 | 2020-12-31 | -8.45 | 0.045792 | NCLH | USD | 18399317000 | 3557544000 | 3300482000 | 3300482000 | 82381000.0 | ... | 4.500750e+08 | 4354105000 | NaN | -2.954490e+08 | 3.160000e+05 | 315636032 | 24.43 | 29.56 | 0.209988 | -0.036292 |
| 2 | 2019-12-31 | 5.09 | -1.602367 | NCLH | USD | 16684599000 | 730145000 | 252876000 | 252876000 | 95427000.0 | ... | 5.292950e+08 | 6515579000 | 1.253926e+09 | 3.829068e+09 | 2.370000e+05 | 213082411 | 57.60 | 37.26 | -0.353125 | -0.025381 |
| 3 | 2018-12-31 | 4.93 | -0.031434 | NCLH | USD | 15205970000 | 550313000 | 163851000 | 163851000 | 90202000.0 | ... | 2.815960e+08 | 5963001000 | 9.040660e+08 | 2.898840e+09 | 2.350000e+05 | 217650644 | 43.03 | 55.27 | 0.284453 | 0.017498 |
| 4 | 2017-12-31 | 3.96 | -0.196755 | NCLH | USD | 14094869000 | 518337000 | 176190000 | 176190000 | 82121000.0 | ... | 1.666900e+08 | 5749766000 | 2.392550e+08 | 1.963128e+09 | 2.330000e+05 | 228528562 | 54.87 | 57.90 | 0.055221 | 0.020647 |
2293 rows × 45 columns
This needs is over the total period available in the data, should look at period to period changes in the future.
# lets take a look at SP Stocks that have had the largest increases over time
norm_cols = list(df.columns[5:41])
price_perf_list = []
for t in tickers:
df_sub_list = df[df['ticker']==t]
if not df_sub_list.empty:
# price performance
df_sub_list = df_sub_list.sort_values(['fiscalDateEnding'],ascending=True)
start_price = df_sub_list.iloc[0]['1D_close_after']
end_price = df_sub_list.iloc[-1]['1D_close_after']
duration = ((df_sub_list.iloc[-1]['fiscalDateEnding']-df_sub_list.iloc[0]['fiscalDateEnding']).days)/365
price_perf = (end_price - start_price) / start_price
price_perf_py = (price_perf) / duration
itr_dict = {'ticker':t,
'start_yr':df_sub_list.iloc[0]['fiscalDateEnding'],
'end_yr':df_sub_list.iloc[-1]['fiscalDateEnding'],
'duration': duration,
'start_price':start_price,
'end_price':end_price,
'price_perf':price_perf,
'price_perf_per_yr':price_perf_py}
# column change
for c in norm_cols:
c_chng = (df_sub_list.iloc[-1][c] - df_sub_list.iloc[0][c])/df_sub_list.iloc[0][c]
key_name = c + "_change"
itr_dict[key_name] = c_chng
price_perf_list.append(itr_dict)
chng_df = pd.DataFrame(price_perf_list)
chng_df = chng_df.sort_values(['price_perf_per_yr'],ascending=False)
chng_df.iloc[:10]
C:\Users\Korey\AppData\Local\Temp/ipykernel_440/1534687591.py:29: RuntimeWarning: divide by zero encountered in double_scalars c_chng = (df_sub_list.iloc[-1][c] - df_sub_list.iloc[0][c])/df_sub_list.iloc[0][c] C:\Users\Korey\AppData\Local\Temp/ipykernel_440/1534687591.py:29: RuntimeWarning: invalid value encountered in double_scalars c_chng = (df_sub_list.iloc[-1][c] - df_sub_list.iloc[0][c])/df_sub_list.iloc[0][c] C:\Users\Korey\AppData\Local\Temp/ipykernel_440/1534687591.py:16: RuntimeWarning: invalid value encountered in double_scalars price_perf_py = (price_perf) / duration C:\Users\Korey\AppData\Local\Temp/ipykernel_440/1534687591.py:29: RuntimeWarning: divide by zero encountered in longlong_scalars c_chng = (df_sub_list.iloc[-1][c] - df_sub_list.iloc[0][c])/df_sub_list.iloc[0][c]
| ticker | start_yr | end_yr | duration | start_price | end_price | price_perf | price_perf_per_yr | totalAssets_change | totalCurrentAssets_change | ... | currentLongTermDebt_change | longTermDebtNoncurrent_change | shortLongTermDebtTotal_change | otherCurrentLiabilities_change | otherNonCurrentLiabilities_change | totalShareholderEquity_change | treasuryStock_change | retainedEarnings_change | commonStock_change | commonStockSharesOutstanding_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 290 | ENPH | 2017-12-31 | 2021-12-31 | 4.00274 | 2.47 | 145.13 | 57.757085 | 14.429388 | 11.292598 | 10.211237 | ... | 3.937288 | 28.441062 | 19.856787 | 4.806956 | 7.792553 | -48.136533 | NaN | 0.371998 | 0.000000 | 0.558465 |
| 131 | MRNA | 2018-12-31 | 2021-12-31 | 3.00274 | 16.96 | 213.86 | 11.609670 | 3.866359 | 11.212606 | 9.282110 | ... | NaN | NaN | 26.352265 | 23.113171 | 293.573643 | 8.243642 | NaN | -10.892443 | -1.000000 | 0.225673 |
| 49 | AMD | 2017-12-31 | 2021-12-31 | 4.00274 | 11.88 | 132.00 | 10.111111 | 2.526048 | 2.496340 | 2.258542 | ... | 3.457143 | -0.999245 | -0.775627 | 0.065217 | 1.720339 | 11.578859 | 18.722222 | -0.813376 | 0.333333 | 0.248190 |
| 424 | ETSY | 2017-12-31 | 2021-12-31 | 4.00274 | 20.25 | 185.15 | 8.143210 | 2.034409 | 5.327471 | 2.053974 | ... | NaN | NaN | inf | 6.218621 | 5.703373 | 0.583846 | NaN | -1.745083 | 0.040984 | 0.043138 |
| 202 | DXCM | 2017-12-31 | 2021-12-31 | 4.00274 | 53.26 | 471.49 | 7.852610 | 1.961809 | 4.456587 | 3.947496 | ... | NaN | 4.197497 | 5.469225 | 6.617978 | 3.945055 | 3.869099 | inf | -1.206460 | 0.000000 | 0.114943 |
| 167 | FTNT | 2017-12-31 | 2021-12-31 | 4.00274 | 44.10 | 314.66 | 6.135147 | 1.532737 | 1.621507 | 1.097274 | ... | NaN | NaN | NaN | 1.400945 | 5.883721 | 0.298505 | NaN | 0.464109 | 0.000000 | -0.035082 |
| 203 | MCHP | 2020-03-31 | 2021-03-31 | 1.00000 | 64.04 | 160.17 | 1.501093 | 1.501093 | -0.054361 | -0.032473 | ... | 1.172963 | -0.145626 | -0.060967 | -0.034919 | 0.216390 | -0.044472 | -0.133440 | -0.011333 | 1.500000 | 0.114961 |
| 398 | GNRC | 2017-12-31 | 2021-12-31 | 4.00274 | 50.41 | 316.55 | 5.279508 | 1.318974 | 1.407633 | 1.241766 | ... | 0.838675 | -0.056678 | 0.965274 | 2.034248 | NaN | 2.995686 | 0.527103 | 2.218469 | 0.024011 | 0.017999 |
| 88 | GE | 2017-12-31 | 2021-12-31 | 4.00274 | 18.54 | 101.40 | 4.469256 | 1.116549 | -0.461404 | -0.387057 | ... | -0.823119 | NaN | -0.738571 | -0.646599 | -0.789761 | -0.280577 | -0.044863 | -0.274084 | -0.978632 | -0.873392 |
| 194 | CMG | 2017-12-31 | 2021-12-31 | 4.00274 | 313.79 | 1597.81 | 4.091972 | 1.022293 | 2.252180 | 1.194579 | ... | NaN | NaN | NaN | 0.882263 | -0.037611 | 0.683742 | 0.437667 | 0.639150 | 0.033426 | -0.001243 |
10 rows × 44 columns
chng_df['class'] = 0
chng_df = chng_df[chng_df['duration']>=2]
chng_df.loc[chng_df['ticker'].isin(list(chng_df.iloc[0:25]['ticker'].values)),'class'] = 3
chng_df.loc[chng_df['ticker'].isin(list(chng_df.iloc[26:74]['ticker'].values)),'class'] = 2
chng_df.loc[chng_df['ticker'].isin(list(chng_df.iloc[75:100]['ticker'].values)),'class'] = 1
chng_df[chng_df['class']==3]
| ticker | start_yr | end_yr | duration | start_price | end_price | price_perf | price_perf_per_yr | totalAssets_change | totalCurrentAssets_change | ... | longTermDebtNoncurrent_change | shortLongTermDebtTotal_change | otherCurrentLiabilities_change | otherNonCurrentLiabilities_change | totalShareholderEquity_change | treasuryStock_change | retainedEarnings_change | commonStock_change | commonStockSharesOutstanding_change | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 290 | ENPH | 2017-12-31 | 2021-12-31 | 4.00274 | 2.47 | 145.13 | 57.757085 | 14.429388 | 11.292598 | 10.211237 | ... | 28.441062 | 19.856787 | 4.806956 | 7.792553 | -48.136533 | NaN | 0.371998 | 0.000000 | 0.558465 | 3 |
| 131 | MRNA | 2018-12-31 | 2021-12-31 | 3.00274 | 16.96 | 213.86 | 11.609670 | 3.866359 | 11.212606 | 9.282110 | ... | NaN | 26.352265 | 23.113171 | 293.573643 | 8.243642 | NaN | -10.892443 | -1.000000 | 0.225673 | 3 |
| 49 | AMD | 2017-12-31 | 2021-12-31 | 4.00274 | 11.88 | 132.00 | 10.111111 | 2.526048 | 2.496340 | 2.258542 | ... | -0.999245 | -0.775627 | 0.065217 | 1.720339 | 11.578859 | 18.722222 | -0.813376 | 0.333333 | 0.248190 | 3 |
| 424 | ETSY | 2017-12-31 | 2021-12-31 | 4.00274 | 20.25 | 185.15 | 8.143210 | 2.034409 | 5.327471 | 2.053974 | ... | NaN | inf | 6.218621 | 5.703373 | 0.583846 | NaN | -1.745083 | 0.040984 | 0.043138 | 3 |
| 202 | DXCM | 2017-12-31 | 2021-12-31 | 4.00274 | 53.26 | 471.49 | 7.852610 | 1.961809 | 4.456587 | 3.947496 | ... | 4.197497 | 5.469225 | 6.617978 | 3.945055 | 3.869099 | inf | -1.206460 | 0.000000 | 0.114943 | 3 |
| 167 | FTNT | 2017-12-31 | 2021-12-31 | 4.00274 | 44.10 | 314.66 | 6.135147 | 1.532737 | 1.621507 | 1.097274 | ... | NaN | NaN | 1.400945 | 5.883721 | 0.298505 | NaN | 0.464109 | 0.000000 | -0.035082 | 3 |
| 398 | GNRC | 2017-12-31 | 2021-12-31 | 4.00274 | 50.41 | 316.55 | 5.279508 | 1.318974 | 1.407633 | 1.241766 | ... | -0.056678 | 0.965274 | 2.034248 | NaN | 2.995686 | 0.527103 | 2.218469 | 0.024011 | 0.017999 | 3 |
| 88 | GE | 2017-12-31 | 2021-12-31 | 4.00274 | 18.54 | 101.40 | 4.469256 | 1.116549 | -0.461404 | -0.387057 | ... | NaN | -0.738571 | -0.646599 | -0.789761 | -0.280577 | -0.044863 | -0.274084 | -0.978632 | -0.873392 | 3 |
| 194 | CMG | 2017-12-31 | 2021-12-31 | 4.00274 | 313.79 | 1597.81 | 4.091972 | 1.022293 | 2.252180 | 1.194579 | ... | NaN | NaN | 0.882263 | -0.037611 | 0.683742 | 0.437667 | 0.639150 | 0.033426 | -0.001243 | 3 |
| 370 | EPAM | 2017-12-31 | 2021-12-31 | 4.00274 | 110.21 | 556.68 | 4.051084 | 1.012078 | 1.818004 | 1.363960 | ... | NaN | -0.360125 | 2.934878 | 4.138315 | 1.551028 | 0.000000 | 2.526333 | 0.075472 | 0.072953 | 3 |
| 335 | ZBRA | 2017-12-31 | 2021-12-31 | 4.00274 | 109.54 | 530.86 | 3.846266 | 0.960908 | 0.453801 | 0.588899 | ... | -0.576287 | -0.086857 | 0.626316 | -0.427350 | 2.577938 | 0.650000 | 1.862981 | 0.000000 | 0.004569 | 3 |
| 134 | FCX | 2018-12-31 | 2021-12-31 | 3.00274 | 10.82 | 41.88 | 2.870610 | 0.955997 | 0.137531 | 0.417240 | ... | -0.183927 | -0.143323 | 1.014851 | -0.245291 | 0.426822 | 0.151596 | -0.387509 | 0.012658 | 0.011034 | 3 |
| 266 | CPRT | 2017-07-31 | 2021-07-31 | 4.00274 | 31.30 | 145.99 | 3.664217 | 0.915427 | 1.301206 | 1.899241 | ... | NaN | -0.366086 | 20.713489 | NaN | 2.212453 | NaN | 2.848026 | 0.043478 | 0.028314 | 3 |
| 111 | LRCX | 2017-06-30 | 2021-06-30 | 4.00274 | 148.13 | 631.44 | 3.262742 | 0.815127 | 0.310935 | 0.274530 | ... | 1.788874 | 0.846508 | 1.829624 | 0.423080 | -0.115918 | 1.999643 | 1.349702 | -0.117284 | -0.118858 | 3 |
| 362 | PAYC | 2017-12-31 | 2021-12-31 | 4.00274 | 83.88 | 355.80 | 3.241774 | 0.809889 | 1.074102 | 0.914567 | ... | -0.204394 | -0.174126 | 0.746847 | NaN | 2.177684 | 2.494087 | 2.541549 | 0.036606 | 0.003866 | 3 |
| 77 | NOW | 2017-12-31 | 2021-12-31 | 4.00274 | 135.01 | 566.39 | 3.195171 | 0.798246 | 2.041480 | 1.155287 | ... | 1.355488 | -0.367669 | NaN | -0.225912 | 3.744820 | NaN | NaN | -1.000000 | 0.145311 | 3 |
| 217 | MSCI | 2017-12-31 | 2021-12-31 | 4.00274 | 133.03 | 552.00 | 3.149440 | 0.786821 | 0.681093 | 0.691548 | ... | 1.002520 | -191.019269 | 0.780698 | -0.060732 | -1.407636 | 0.955282 | 0.977484 | 0.028571 | -0.085072 | 3 |
| 271 | KEYS | 2017-10-31 | 2021-10-31 | 4.00274 | 44.78 | 183.68 | 3.101831 | 0.774927 | 0.311478 | 0.761139 | ... | -0.121197 | -0.125488 | 0.193548 | -0.011236 | 0.638095 | 21.983871 | 2.294909 | 0.000000 | -0.005376 | 3 |
| 2 | MSFT | 2017-06-30 | 2021-06-30 | 4.00274 | 69.46 | 277.65 | 2.997265 | 0.748803 | 0.333452 | 0.133439 | ... | -0.341764 | -0.325233 | 0.518220 | 2.783319 | 0.618816 | NaN | 2.210929 | 0.199033 | -0.024520 | 3 |
| 307 | MTCH | 2017-12-31 | 2021-12-31 | 4.00274 | 31.80 | 125.27 | 2.939308 | 0.734324 | -0.137108 | -0.426760 | ... | 0.934570 | 1.606786 | 0.083110 | 2.035680 | -1.083855 | NaN | -14.687385 | 0.021661 | 2.446490 | 3 |
| 171 | SNPS | 2017-10-31 | 2021-10-31 | 4.00274 | 86.92 | 340.75 | 2.920272 | 0.729568 | 0.621866 | 0.669055 | ... | -0.812819 | -0.479175 | 0.373434 | 0.553139 | 0.616530 | 0.836817 | 1.122193 | 0.017276 | 0.017395 | 3 |
| 67 | INTU | 2017-07-31 | 2021-07-31 | 4.00274 | 136.90 | 534.29 | 2.902776 | 0.725197 | 2.814159 | 2.644523 | ... | NaN | 3.675799 | 1.028090 | -0.592308 | 4.808711 | 0.201614 | 0.685076 | 0.000000 | 0.068710 | 3 |
| 302 | WST | 2017-12-31 | 2021-12-31 | 4.00274 | 100.84 | 390.47 | 2.872174 | 0.717552 | 0.778935 | 1.342972 | ... | 0.059898 | 0.284264 | 0.328455 | -0.602083 | 0.824674 | 1.103575 | 1.085130 | 0.000000 | 0.004060 | 3 |
| 405 | SEDG | 2019-12-31 | 2021-12-31 | 2.00274 | 102.38 | 248.91 | 1.431237 | 0.714639 | 0.934975 | 0.834647 | ... | 7184.375723 | 58.559839 | 0.357464 | -0.092466 | 0.614004 | NaN | 0.925060 | 0.000000 | 0.080112 | 3 |
| 395 | TECH | 2017-06-30 | 2021-06-30 | 4.00274 | 117.68 | 449.57 | 2.820275 | 0.704586 | 0.452271 | 0.467153 | ... | NaN | 0.201916 | -0.945250 | 3.527485 | 0.645879 | NaN | 0.358478 | 0.042781 | 0.042816 | 3 |
25 rows × 45 columns
corr_matrix = chng_df.corr()
corr_matrix = corr_matrix[['price_perf_per_yr','price_perf']].sort_values(by='price_perf')
corr_matrix = corr_matrix[(corr_matrix['price_perf_per_yr']>.1) | (corr_matrix['price_perf_per_yr']<.1)]
corr_matrix = corr_matrix[(corr_matrix['price_perf']>.1) | (corr_matrix['price_perf']<=.1)]
sn.set(rc = {'figure.figsize':(15,18)})
sn.heatmap(corr_matrix, annot=True)
<AxesSubplot:>
fig = px.scatter(chng_df,
x='totalShareholderEquity_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
chng_df_tse = chng_df.copy()
chng_df_tse = chng_df_tse[(chng_df_tse['totalShareholderEquity_change'] > -10)]
fig = px.scatter(chng_df_tse,
x='totalShareholderEquity_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
corr_matrix = chng_df_tse.corr()
corr_matrix = corr_matrix[['price_perf_per_yr','price_perf']].sort_values(by='price_perf')
corr_matrix = corr_matrix[(corr_matrix['price_perf_per_yr']>.1) | (corr_matrix['price_perf_per_yr']<.1)]
corr_matrix = corr_matrix[(corr_matrix['price_perf']>.1) | (corr_matrix['price_perf']<=.1)]
corr_matrix.loc['totalShareholderEquity_change']
price_perf_per_yr 0.307735 price_perf 0.300332 Name: totalShareholderEquity_change, dtype: float64
fig = px.scatter(chng_df,
x='intangibleAssets_change',
y='price_perf',
color='class',
color_continuous_scale='Burg')
fig.show()
chng_df_ia = chng_df.copy()
chng_df_ia = chng_df_ia[(chng_df_ia['intangibleAssets_change'] < 20)]
fig = px.scatter(chng_df_ia,
x='intangibleAssets_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
corr_matrix = chng_df_ia.corr()
corr_matrix = corr_matrix[['price_perf_per_yr','price_perf']].sort_values(by='price_perf')
corr_matrix = corr_matrix[(corr_matrix['price_perf_per_yr']>.1) | (corr_matrix['price_perf_per_yr']<.1)]
corr_matrix = corr_matrix[(corr_matrix['price_perf']>.1) | (corr_matrix['price_perf']<=.1)]
corr_matrix.loc['intangibleAssets_change']
price_perf_per_yr 0.098838 price_perf 0.104331 Name: intangibleAssets_change, dtype: float64
fig = px.scatter(chng_df,
x='totalLiabilities_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
chng_df_tl = chng_df.copy()
chng_df_tl = chng_df_tl[(chng_df_tl['totalLiabilities_change'] < 8)]
fig = px.scatter(chng_df_tl,
x='totalLiabilities_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
corr_matrix = chng_df_tl.corr()
corr_matrix = corr_matrix[['price_perf_per_yr','price_perf']].sort_values(by='price_perf')
corr_matrix = corr_matrix[(corr_matrix['price_perf_per_yr']>.1) | (corr_matrix['price_perf_per_yr']<.1)]
corr_matrix = corr_matrix[(corr_matrix['price_perf']>.1) | (corr_matrix['price_perf']<=.1)]
corr_matrix.loc['totalLiabilities_change']
price_perf_per_yr 0.235417 price_perf 0.239063 Name: totalLiabilities_change, dtype: float64
outlier_cols = list(chng_df.columns.values)[8:44]
chng_df[outlier_cols] = chng_df[outlier_cols][chng_df[outlier_cols].apply(lambda x: np.abs(x - x.mean()) / x.std() < 3)]
corr_matrix = chng_df.corr()
corr_matrix[['price_perf_per_yr','price_perf']].sort_values(by='price_perf').dropna()
| price_perf_per_yr | price_perf | |
|---|---|---|
| start_price | -0.025686 | -0.024473 |
| retainedEarnings_change | -0.039455 | -0.024081 |
| otherCurrentLiabilities_change | -0.005725 | -0.006540 |
| duration | -0.032623 | -0.002838 |
| cashAndShortTermInvestments_change | 0.004590 | 0.004792 |
| totalCurrentLiabilities_change | 0.011005 | 0.009611 |
| commonStockSharesOutstanding_change | 0.011437 | 0.011665 |
| cashAndCashEquivalentsAtCarryingValue_change | 0.038228 | 0.034960 |
| end_price | 0.079360 | 0.079489 |
| otherNonCurrentLiabilities_change | 0.100154 | 0.101999 |
| accumulatedDepreciationAmortizationPPE_change | 0.103655 | 0.104565 |
| capitalLeaseObligations_change | 0.102294 | 0.106313 |
| propertyPlantEquipment_change | 0.149649 | 0.138731 |
| currentNetReceivables_change | 0.138442 | 0.138979 |
| totalNonCurrentLiabilities_change | 0.152901 | 0.152976 |
| totalCurrentAssets_change | 0.186099 | 0.181184 |
| otherNonCurrrentAssets_change | 0.195387 | 0.199191 |
| inventory_change | 0.214552 | 0.212653 |
| currentAccountsPayable_change | 0.224870 | 0.225654 |
| totalLiabilities_change | 0.245346 | 0.248904 |
| class | 0.457207 | 0.446905 |
| totalAssets_change | 0.582800 | 0.565028 |
| price_perf_per_yr | 1.000000 | 0.997559 |
| price_perf | 0.997559 | 1.000000 |
fig = px.scatter(chng_df,
x='totalAssets_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
chng_df_ta = chng_df.copy()
chng_df_ta = chng_df_ta[(chng_df_ta['totalAssets_change'] < 4)]
fig = px.scatter(chng_df_ta,
x='totalAssets_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
chng_df.shape
(461, 45)
cols_to_delete = chng_df.columns[chng_df.isnull().sum()/len(chng_df) > .20]
chng_df.drop(cols_to_delete, axis = 1, inplace = True)
chng_df = chng_df[chng_df.columns[~chng_df.isnull().all()]]
chng_df.shape
(461, 22)
chng_df = chng_df.dropna()
chng_df.shape
(396, 22)
chng_df = chng_df.dropna(axis=1, how='all')
chng_df.columns[8:22]
print(chng_df.columns[8:21])
X = chng_df.iloc[:,8:21]
y = chng_df['class']
Index(['totalAssets_change', 'totalCurrentAssets_change',
'cashAndCashEquivalentsAtCarryingValue_change',
'cashAndShortTermInvestments_change', 'propertyPlantEquipment_change',
'otherNonCurrrentAssets_change', 'totalLiabilities_change',
'totalCurrentLiabilities_change', 'totalNonCurrentLiabilities_change',
'otherCurrentLiabilities_change', 'otherNonCurrentLiabilities_change',
'retainedEarnings_change', 'commonStockSharesOutstanding_change'],
dtype='object')
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
X = chng_df.iloc[:,8:21]
y = chng_df['class']
model = ExtraTreesClassifier()
model.fit(X,y)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
print(model.feature_importances_)
fig = px.bar(feat_importances.nlargest(10))
fig.show()
[0.10711434 0.0596032 0.06531938 0.06224042 0.07392545 0.06769709 0.08191968 0.07383749 0.07865531 0.08634442 0.06375282 0.10400966 0.07558074]
chng_df_re = chng_df.copy()
fig = px.scatter(chng_df_re,
x='retainedEarnings_change',
y='price_perf',
color='class',
color_continuous_scale='Burg',
trendline='ols')
fig.show()
fig = px.scatter(chng_df_re,
x='totalAssets_change',
y='retainedEarnings_change',
opacity=.5,
color='class')
fig.show()
fig = px.scatter(chng_df_re,
x='totalAssets_change',
y='totalLiabilities_change',
opacity=.5,
color='class')
fig.show()
fig = px.scatter(chng_df_re,
x='retainedEarnings_change',
y='totalLiabilities_change',
opacity=.5,
color='class')
fig.show()
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
X = chng_df[['totalAssets_change',
'retainedEarnings_change',
'propertyPlantEquipment_change',
'totalLiabilities_change']]
y = chng_df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print(clf.score(X_test, y_test))
fig = px.scatter(X_test,
x='retainedEarnings_change',
y='totalLiabilities_change',
opacity=.5,
color=y_hat)
fig.show()
0.7169811320754716
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
print(clf.score(X_test, y_test))
y_hat = clf.predict(X_test)
fig = px.scatter(X_test,
x='retainedEarnings_change',
y='totalLiabilities_change',
opacity=.5,
color=y_test,
title='Actual')
fig.show()
fig = px.scatter(X_test,
x='retainedEarnings_change',
y='totalLiabilities_change',
opacity=.5,
color=y_hat,
title='Predicted Using Gradient Boosted Trees Classifier')
fig.show()
0.7672955974842768
Features to monitor for future stock selections based on above analysis include: